Load packages
library(data.table) # For faster data manipulation
library(tidyverse) # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf) # For spatial data handling
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(leaflet) # For interactive maps
library(leaflet.extras) # For additional leaflet features
library(mapview) # For easier map visualization
library(tmap) # For thematic maps
library(tigris) # For US road networks
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(future) # For parallel processing
library(future.apply) # For parallel processing with apply functions
# Create directories if they don't exist
if (!dir.exists("./data/tigris")) {
dir.create("./data/tigris", recursive = TRUE)
}
# Set custom cache directory (optional)
options(tigris_cache_dir = "./data/tigris")
# Configure tigris to use caching
options(tigris_use_cache = TRUE)
# Function to create buffers in batches with proper projection
create_buffers_in_batches <- function(sf_object, buffer_dist, batch_size = 500) {
# First, reproject to an appropriate projected CRS for the region
# For California, UTM Zone 10N (EPSG:26910) or 11N (EPSG:26911) works well
# If we're unsure about your region, a web mercator projection (3857) is a reasonable default
message("Reprojecting data to a meter-based CRS...")
sf_object_projected <- st_transform(sf_object, 3310) # Web Mercator
n_features <- nrow(sf_object_projected)
n_batches <- ceiling(n_features / batch_size)
# Create empty list to store results
buffer_list <- vector("list", n_batches)
# Process in batches
for (i in 1:n_batches) {
start_idx <- (i-1) * batch_size + 1
end_idx <- min(i * batch_size, n_features)
cat(sprintf("Processing batch %d of %d (features %d to %d)\n",
i, n_batches, start_idx, end_idx))
# Extract batch
batch <- sf_object_projected[start_idx:end_idx, ]
# Create buffer (with parallel processing within sf)
buffer_list[[i]] <- st_buffer(batch, dist = buffer_dist)
}
# Combine results
result <- do.call(rbind, buffer_list)
# Reproject back to original CRS if needed
message("Reprojecting results back to original CRS...")
st_transform(result, st_crs(sf_object))
}
Load data (filter for California for now to limit data set size)
# Efficient approach
df.acc <- fread("data/us_accidents/US_accidents_March23.csv")[
# Filter date range to 2016-2021
as.Date(Start_Time) >= as.Date("2016-01-01") &
as.Date(Start_Time) <= as.Date("2021-12-31") &
# And California
State == "CA"
][, `:=`(
# Add year, quarter, month columns
year = data.table::year(Start_Time),
quarter = data.table::quarter(Start_Time),
month = data.table::month(Start_Time),
# Calculate duration (assuming End_Time exists in the dataset)
duration = as.numeric(difftime(End_Time, Start_Time, units = "days"))
)] %>%
as_tibble() # Convert to tibble only at the end for performance
df.const <- fread("data/us_constructions/US_constructions_Dec21.csv")[
# Filter date range to 2016-2021
as.Date(Start_Time) >= as.Date("2016-01-01") &
as.Date(Start_Time) <= as.Date("2021-12-31") &
# And California
State == "CA"
][, `:=`(
# Add year, quarter, month columns
year = year(Start_Time),
quarter = quarter(Start_Time),
month = month(Start_Time),
# Calculate duration (assuming End_Time exists in the dataset)
duration = as.numeric(difftime(End_Time, Start_Time, units = "days"))
)] %>%
as_tibble() # Convert to tibble only at the end for performance
Convert dataframes to sf objects
# Convert accident data to sf object
df.acc.sf <- df.acc %>%
filter(!is.na(Start_Lat) & !is.na(Start_Lng)) %>%
st_as_sf(coords = c("Start_Lng", "Start_Lat"), crs = 4326)
# Convert construction data to sf object
df.const.sf <- df.const %>%
filter(!is.na(Start_Lat) & !is.na(Start_Lng)) %>%
st_as_sf(coords = c("Start_Lng", "Start_Lat"), crs = 4326)
# Get US roads (this can be slow for the entire US, so we might want to limit by state)
# First get all counties in California
ca_counties <- counties("CA", year = 2021)
# Then get roads for each county
ca_roads_list <- lapply(ca_counties$COUNTYFP, function(county_code) {
roads("CA", county = county_code, year = 2021)
})
# Optionally combine all county road data into one object
ca_roads <- do.call(rbind, ca_roads_list)
# Static map for Accidents with year coloring
# First, store your plot in a variable
ca_plot <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
geom_sf(data = df.acc.sf %>% filter(State == "CA"),
aes(color = factor(year)), alpha = 0.7, size = 0.5) +
scale_color_viridis_d(name = "Year") + # Use viridis color palette for discrete values
theme_minimal() +
labs(title = "US Road Accidents by Year (2016-2021)") +
theme(legend.position = "bottom")
print(ca_plot)
# Then save it using ggsave
#ggsave(filename = "imgs/california_accidents.png",
# plot = ca_plot,
# width = 10, # width in inches
# height = 8, # height in inches
# dpi = 300) # resolution
# Construction sites map with year coloring
# First, store our plot in a variable
ca_const_plot <- ggplot() +
geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
geom_sf(data = df.const.sf %>% filter(State == "CA"),
aes(color = factor(year)), alpha = 0.7, size = 0.5) +
scale_color_viridis_d(name = "Year") + # Use viridis color palette for discrete values
theme_minimal() +
labs(title = "US Construction Sites by Year (2016-2021)") +
theme(legend.position = "bottom")
print(ca_const_plot)
# Then save it using ggsave
#ggsave(filename = "imgs/california_construction.png",
# plot = ca_const_plot,
# width = 10, # width in inches
# height = 8, # height in inches
# dpi = 300) # resolution
# For interactive maps (often better for large datasets)
# Accidents map
accident_map <- leaflet() %>%
addTiles() %>%
addHeatmap(data = df.acc.sf,
intensity = ~1,
radius = 8,
blur = 10) %>%
setView(lng = -119.4179, lat = 36.7783, zoom = 6) %>% # Center to CA
setMaxBounds(lng1 = -124.6, lat1 = 42.0, # Northwest corner of CA
lng2 = -114.1, lat2 = 32.5) # Southeast corner of CA
# Display maps
accident_map